Exploratory Data Analysis

In this notebook, we provide insight into data by creating tables and plots.

Import Libraries
In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from IPython.display import display
from wordcloud import WordCloud
from collections import Counter
from typing import Tuple, List
import folium  
import folium.plugins as plugins

def set_pandas_display_options() -> None:
    """Set pandas display options."""
    display = pd.options.display
    display.max_columns = 500
    display.max_rows = 500
    display.max_colwidth = 100
    display.width = None

set_pandas_display_options()

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
Import Dataset
In [2]:
# Import Dataset
with open('Nevada.pkl', 'rb') as nevada:
    Nevada = pickle.load(nevada)
In [3]:
display(Nevada.head(3))
review_id user_id business_id review_stars text date business_name address city state postal_code latitude longitude rating attributes categories user_name average_stars LemaText cuisines input_text
0 izOSwMP2js_ptjDQZsynig KriIEvoyWwhoswBoqqUpzA faPVqws-x-5k2CQKDNtHxw 5 We just recently returned from Las Vegas and had the pleasure of stopping by your restaurant on ... 2019-12-13 15:34:17 Yardbird Southern Table & Bar 3355 Las Vegas Blvd S LAS VEGAS NV 89109 36.122328 -115.170112 4.5 {'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'True', 'OutdoorSeating': 'False', 'Restau... restaurants, american (new), southern, nightlife, bars, cocktail bars Christine 5.00 return las pleasure stop restaurant lunch seat lounge amaze server order lunch long cold server ... american return las pleasure stop restaurant lunch seat lounge amaze server order lunch long cold server ...
1 qUxCvPEkl7xrmY-n1szciA D3XxyNOy8b_1484Oi1eYOg VrGI7_nRjXpn0415S3coGQ 5 I've been here three times now one of my coworkers introduced me to this place the staff are ver... 2019-12-13 15:32:07 Vegas Noodle House 3516 Wynn Rd LAS VEGAS NV 89103 36.125887 -115.194425 4.0 {'RestaurantsDelivery': 'True', 'GoodForKids': 'True', 'RestaurantsGoodForGroups': 'True', 'Rest... restaurants, vegetarian, thai, noodles Aaron 3.88 time place staff friendly restaurant clean soup delicious area bite eat highly soup thai time place staff friendly restaurant clean soup delicious area bite eat highly soup thai
2 oBbpt5C7BwKaTXy6ylzJ_g cvA8vHPR0Gs0zsPnyv6JEQ dVp1llwjZUmhCF4pNsJnQg 4 When we walked in, we noticed that the decor was nice, and the restaurant was clean. Hubby and I... 2019-12-13 15:29:12 The Modern Vegan 700 E Naples Dr LAS VEGAS NV 89119 36.105993 -115.149127 4.0 {'RestaurantsAttire': ''casual'', 'ByAppointmentOnly': 'False', 'BikeParking': 'True', 'Restaura... vegan, restaurants, breakfast & brunch, american (new), american (traditional) Taheerah 3.89 nice restaurant clean hubby menu heaven order chicken waffle order breakfast burrito honest brea... american nice restaurant clean hubby menu heaven order chicken waffle order breakfast burrito honest brea...
Exploration of Users

Let us take a look on the users by seeing the activity of the most active of them.

In [4]:
user_agg=Nevada.groupby(['user_id','user_name']).agg({'review_id':['count'],'date':['min','max'],'review_stars':['mean']})
user_agg=user_agg.sort_values([('review_id','count')],ascending=False)
print("                                Top 10 Users in Yelp Dataset")
user_agg.head(10)
#x.to_excel(r'top_users.xlsx')
                                Top 10 Users in Yelp Dataset
Out[4]:
review_id date review_stars
count min max mean
user_id user_name
bLbSNkLggFnqwNNzzq-Ijw Stefany 814 2012-05-20 19:50:41 2019-11-19 17:37:28 3.431204
PKEzKWv_FktMm2mGPjwd0Q Norm 505 2008-12-12 02:35:45 2019-11-29 19:22:46 3.613861
UYcmGbelzRa0Q6JqzLoguw Emily 364 2010-11-04 21:55:40 2019-10-15 17:15:31 3.541209
U4INQZOPSUaj8hMjLlZ3KA Michael 314 2008-06-01 02:19:17 2019-12-03 12:24:36 3.821656
JaqcCU3nxReTW2cBLHounA Zachary 309 2016-01-01 21:28:09 2019-11-04 18:50:11 3.770227
_VMGbmIeK71rQGwOBWt_Kg Chris 291 2010-02-03 07:01:13 2019-12-11 03:27:51 3.986254
tH0uKD-vNwMoEc3Xk3Cbdg Cathy 286 2012-08-11 15:01:51 2019-10-13 22:24:41 3.839161
3nIuSCZk5f_2WWYMLN7h3w Lauren 285 2015-06-07 16:02:18 2019-11-05 23:07:54 3.978947
n86B7IkbU20AkxlFX_5aew Jade 271 2010-02-21 08:09:07 2017-11-16 19:30:20 3.675277
C2C0GPKvzWWnP57Os9eQ0w Clint 270 2009-07-06 05:06:54 2018-05-28 04:55:22 3.677778
Exploration of Cities
In [5]:
cities = Nevada.city
# Calculate the number of times the city appears
counts = cities.value_counts()
# Calculate number of unique businesses per city
unique_businesses =  Nevada.groupby('city')['business_id'].nunique()
# Calculate the average stars per city
avg_stars = round(Nevada.groupby('city')['review_stars'].mean(),2)
# Calculate number of average reviews per city
avg_reviews = round((counts/unique_businesses),2)
table=pd.DataFrame({'Number of Reviews':counts,
                    'Average Number of Reviews':avg_reviews,
                    'Number of Businesses':unique_businesses,
                    'Average stars per review':avg_stars}).sort_values(by=['Number of Businesses'], ascending=False)
display(table)
Number of Reviews Average Number of Reviews Number of Businesses Average stars per review
LAS VEGAS 909689 321.44 2830 3.86
HENDERSON 81571 221.66 368 3.83
NORTH LAS VEGAS 18241 116.18 157 3.58
BOULDER CITY 4293 178.88 24 4.22

Las Vegas, in the state of Nevada, has the most businesses followed by Henderson city. It is worth mentioning the high number of reviews that characterize the businesses of Las Vegas. What is more, every city has over 100 reviews average number of reviews per business.

In [6]:
# Create interactive map with default basemap

data=[]
# Rearrange data to suit the format needed for folium
stars_list=list(Nevada['review_stars'].unique())
for star in stars_list:
    subset=Nevada[Nevada['review_stars']==star]
    data.append(subset[['latitude','longitude']].values.tolist())
    
# Initialize at Las Vegas (Google Coordinates)
lat = 36.127430 
lon = -115.138460 
zoom_start=10
print("Las Vegas Review Map")

# basic map
m = folium.Map(location=[lat, lon], tiles="OpenStreetMap", zoom_start=zoom_start)
# Show variations across star ratings 
hm = plugins.HeatMapWithTime(data,max_opacity=0.3,auto_play=True,display_index=True,radius=7)
hm.add_to(m)
m
Las Vegas Review Map
Out[6]:
Exploration of Cuisines
In [7]:
# Find the top-10 cuisines in Nevada regarding their number of reviews
most_common_cuisines = Counter(" ".join(Nevada['cuisines']).split(" ")).most_common()
top10_cuisines = most_common_cuisines[0:10]
display(top10_cuisines)
[('american', 443459),
 ('mexican', 134194),
 ('japanese', 130244),
 ('italian', 115787),
 ('asian', 100142),
 ('chinese', 80310),
 ('korean', 42394),
 ('french', 38635),
 ('thai', 38349),
 ('mediterranean', 33491)]
In [8]:
# Create a barplot to see the distribution of the number of reviews per cuisine.

data = top10_cuisines
names, values = zip(*data) 
ind = np.arange(len(data))  # the x locations for the groups
width = 0.7      # the width of the bars

# Create matplotlib figure
fig = plt.figure(figsize=(18,10),facecolor='#F0F0F0') 
# Create matplotlib axes
ax = fig.add_subplot(111) 

rects1 = ax.bar(ind, values, width, color='#F2583E')

# Add some text for labels, title and axes ticks
ax.set_ylabel('Count')
ax.set_xticks(ind+width/2.)
ax.set_xticklabels(names)

def autolabel(rects):
    # attach some text labels
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.0*height,
                '%d' % int(height),
                ha='center', va='bottom')

autolabel(rects1)

# Set properties of gridlines
ax.grid(color='grey', linestyle='-', linewidth=0, alpha=1.0)
# The label of the x-axis
ax.set_xlabel("Cuisines", fontsize=15)
# The label of the y-axis
ax.set_ylabel("# Reviews per Cuisine", fontsize=15)
# The title of the plot
ax.set_title("Top 10 Cuisines Categories Nevada", fontsize=16)
# Set y axes limits
ax.set_ylim([0,470000])
# Set rotation of x-ticks
plt.xticks(rotation=45,fontsize=14)
#plt.savefig("top10cuisines.png")
plt.show()

The american cuisine, followed by the mexican and the japanese, seem to have the most reviews in our dataset.

In [9]:
unique_businesses = Nevada[['business_id','business_name','cuisines']].reset_index(drop=True)
unique_businesses = unique_businesses.drop_duplicates('business_id', keep='last')
unique_businesses.shape
Out[9]:
(3379, 3)
In [10]:
unique_businesses = Nevada[['business_id','business_name','cuisines']].reset_index(drop=True)
unique_businesses = unique_businesses.drop_duplicates('business_id', keep='last')

cat = Counter(" ".join(unique_businesses['cuisines']).split(" ")).most_common()

table = pd.DataFrame(cat, columns =['Cuisines', 'count'])
cuisines = table.Cuisines

count = table['count'].astype(str)
percent100 = (table['count'] / table['count'].sum()).mul(100).round(1).astype(str)# + '%'
cuisines_freq = pd.DataFrame({'cuisines':cuisines,'count':count, 'per100':percent100})

cuisines_freq
Out[10]:
cuisines count per100
0 american 1236 30.2
1 mexican 816 19.9
2 chinese 360 8.8
3 italian 348 8.5
4 japanese 292 7.1
5 asian 252 6.2
6 thai 120 2.9
7 mediterranean 117 2.9
8 hawaiian 107 2.6
9 korean 89 2.2
10 vietnamese 69 1.7
11 ethnic 52 1.3
12 french 51 1.2
13 greek 40 1.0
14 indian 38 0.9
15 spanish 21 0.5
16 caribbean 19 0.5
17 taiwanese 15 0.4
18 pakistani 14 0.3
19 irish 11 0.3
20 brazilian 10 0.2
21 african 9 0.2
22 british 8 0.2
In [11]:
labels = 'american' , 'mexican', 'chinese', 'italian', 'japanese','asian','thai','mediterranean','hawaiian','korean','other'
sizes = [1236,816,360,348,292,252,120,117,107,89,357]
explode = (0.2, 0.2, 0.2, 0.2,0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2)

# Create matplotlib figure
fig = plt.figure(figsize=(18,10),facecolor='#F0F0F0') 
# Create matplotlib axes
ax = fig.add_subplot(111) 

# Creating color parameters 
colors = ( "#F2583E", "#77BED2", "#F7F3E8", "#D5E1DD", "#747E80","#ffba08", "#e09f3e", "#f48c06", "#e85d04", "#dc2f02","#d3d3d3") 

ax.pie(sizes,
        explode = explode,
        labels = labels,
        autopct = '%1.1f%%',
        shadow = False,
        startangle = 100,
        colors=colors, textprops={'fontsize': 13})

ax.axis ('equal')
# The title of the plot
ax.set_title("Percentage Distribution of Top Cuisines", fontsize=15, pad=40)
#plt.savefig("unique_cuisines_pie.png")
plt.show()
Exploration of Reviews
In [12]:
# Take a loon on the most common words that appear in the dataset.
most_common = Counter(" ".join(Nevada['input_text']).split(" ")).most_common()
most_common
Out[12]:
[('food', 809087),
 ('good', 635837),
 ('not', 626137),
 ('place', 548552),
 ('great', 477284),
 ('service', 447625),
 ('american', 443460),
 ('order', 442339),
 ('time', 397114),
 ('restaurant', 233719),
 ('chicken', 219049),
 ('eat', 209126),
 ('best', 203083),
 ('no', 196012),
 ('delicious', 194996),
 ('menu', 176316),
 ('nice', 172791),
 ('table', 172562),
 ('wait', 166668),
 ('love', 162858),
 ('taste', 150789),
 ('friendly', 135246),
 ('mexican', 134197),
 ('staff', 133131),
 ('experience', 132951),
 ('japanese', 130245),
 ('amaze', 123342),
 ('pretty', 123322),
 ('sauce', 121974),
 ('server', 121293),
 ('meal', 119186),
 ('fresh', 118875),
 ('italian', 115787),
 ('people', 115439),
 ('dinner', 112397),
 ('night', 112012),
 ('price', 111646),
 ('bad', 110716),
 ('bar', 110279),
 ('salad', 105587),
 ('recommend', 102566),
 ('cheese', 100916),
 ('asian', 100142),
 ('rice', 98201),
 ('lunch', 97759),
 ('day', 94578),
 ('well', 90807),
 ('lot', 90768),
 ('flavor', 89261),
 ('favorite', 88963),
 ('seat', 88697),
 ('small', 88320),
 ('steak', 87589),
 ('meat', 87342),
 ('excellent', 84771),
 ('beef', 84137),
 ('worth', 83367),
 ('awesome', 83036),
 ('sit', 82999),
 ('breakfast', 82079),
 ('leave', 81848),
 ('super', 81489),
 ('big', 81086),
 ('pizza', 80339),
 ('chinese', 80311),
 ('bit', 78923),
 ('happy', 76174),
 ('location', 76095),
 ('fry', 75846),
 ('quality', 75518),
 ('long', 75241),
 ('check', 75004),
 ('strip', 73792),
 ('soup', 73683),
 ('dish', 73646),
 ('visit', 73232),
 ('hot', 72667),
 ('drink', 72458),
 ('thing', 69156),
 ('shrimp', 68896),
 ('tasty', 68430),
 ('perfect', 67650),
 ('work', 67368),
 ('spicy', 66453),
 ('pork', 65759),
 ('bread', 65654),
 ('come', 64997),
 ('spot', 64410),
 ('bring', 64099),
 ('sweet', 63556),
 ('atmosphere', 63212),
 ('la', 63102),
 ('special', 62813),
 ('hour', 62704),
 ('waiter', 61554),
 ('tell', 60041),
 ('feel', 59747),
 ('area', 59159),
 ('customer', 58611),
 ('waitress', 58243),
 ('fish', 57400),
 ('decide', 57198),
 ('family', 56901),
 ('dessert', 56675),
 ('huge', 56567),
 ('busy', 56152),
 ('clean', 54660),
 ('husband', 54257),
 ('town', 54099),
 ('potato', 53749),
 ('amazing', 52948),
 ('friend', 52854),
 ('fast', 51387),
 ('large', 51045),
 ('attentive', 50359),
 ('inside', 50056),
 ('plate', 49318),
 ('manager', 48966),
 ('sandwich', 48919),
 ('water', 48896),
 ('open', 48316),
 ('room', 47854),
 ('high', 47630),
 ('half', 47597),
 ('star', 47379),
 ('enjoy', 46918),
 ('think', 46413),
 ('free', 46314),
 ('highly', 46042),
 ('pay', 45956),
 ('roll', 45463),
 ('kind', 45096),
 ('house', 44332),
 ('quick', 43473),
 ('fun', 42714),
 ('korean', 42396),
 ('cream', 42390),
 ('point', 42292),
 ('size', 42201),
 ('hard', 41878),
 ('bowl', 41816),
 ('close', 41745),
 ('absolutely', 41537),
 ('buffet', 41251),
 ('serve', 41189),
 ('din', 40964),
 ('decent', 40283),
 ('party', 40280),
 ('review', 40258),
 ('beer', 40113),
 ('wine', 39832),
 ('walk', 39634),
 ('egg', 39541),
 ('fantastic', 39000),
 ('bacon', 38966),
 ('chip', 38791),
 ('ice', 38780),
 ('french', 38653),
 ('cold', 38626),
 ('grill', 38529),
 ('mediterranean', 38389),
 ('thai', 38349),
 ('wife', 38171),
 ('chef', 38030),
 ('ate', 37726),
 ('coffee', 37351),
 ('garlic', 37189),
 ('finally', 36871),
 ('wonderful', 36751),
 ('couple', 36244),
 ('selection', 36205),
 ('extra', 36172),
 ('yelp', 35685),
 ('disappoint', 35370),
 ('return', 35118),
 ('wrong', 34904),
 ('perfectly', 34888),
 ('person', 34860),
 ('tea', 34852),
 ('money', 34713),
 ('yummy', 34516),
 ('crispy', 34383),
 ('hotel', 34315),
 ('finish', 34292),
 ('portion', 34263),
 ('dry', 34188),
 ('live', 33879),
 ('red', 33685),
 ('style', 33447),
 ('group', 33398),
 ('late', 32705),
 ('cool', 32555),
 ('pho', 32387),
 ('lobster', 32332),
 ('care', 32120),
 ('disappointed', 32089),
 ('start', 31950),
 ('kitchen', 31736),
 ('felt', 31670),
 ('crab', 31596),
 ('drive', 31274),
 ('flavorful', 31195),
 ('bite', 31054),
 ('salmon', 30954),
 ('stay', 30677),
 ('music', 30610),
 ('appetizer', 30465),
 ('choose', 30076),
 ('extremely', 29868),
 ('green', 29689),
 ('watch', 29613),
 ('pm', 29409),
 ('tender', 29117),
 ('fan', 28761),
 ('short', 28350),
 ('light', 28292),
 ('trip', 28068),
 ('stuff', 27901),
 ('expect', 27816),
 ('hawaiian', 27765),
 ('warm', 27558),
 ('curry', 27524),
 ('authentic', 27353),
 ('reason', 27288),
 ('brunch', 27033),
 ('glass', 26869),
 ('pick', 26865),
 ('slow', 26849),
 ('birthday', 26581),
 ('season', 26445),
 ('guy', 26391),
 ('white', 26267),
 ('chocolate', 26059),
 ('deal', 26033),
 ('cake', 25913),
 ('fine', 25816),
 ('fill', 25700),
 ('average', 25664),
 ('expensive', 25584),
 ('helpful', 25535),
 ('vietnamese', 25482),
 ('week', 25448),
 ('real', 25291),
 ('offer', 25235),
 ('reservation', 24991),
 ('rib', 24875),
 ('reasonable', 24840),
 ('horrible', 24764),
 ('casino', 24743),
 ('butter', 24703),
 ('cut', 24701),
 ('end', 24640),
 ('regular', 24633),
 ('wow', 24603),
 ('cook', 24580),
 ('variety', 24518),
 ('choice', 24175),
 ('forget', 24029),
 ('business', 24020),
 ('tuna', 23998),
 ('burrito', 23962),
 ('hungry', 23845),
 ('toast', 23833),
 ('guess', 23713),
 ('cheap', 23360),
 ('list', 23313),
 ('main', 23278),
 ('owner', 23265),
 ('talk', 23228),
 ('broth', 23009),
 ('bland', 23003),
 ('bartender', 22823),
 ('lady', 22767),
 ('medium', 22748),
 ('noodle', 22710),
 ('beautiful', 22602),
 ('add', 22564),
 ('hostess', 22519),
 ('rude', 22382),
 ('salty', 22294),
 ('stop', 22279),
 ('mind', 22275),
 ('make', 22255),
 ('door', 22232),
 ('glad', 22208),
 ('view', 22127),
 ('receive', 22061),
 ('street', 21745),
 ('year', 21695),
 ('mac', 21640),
 ('mouth', 21485),
 ('stand', 21449),
 ('terrible', 21420),
 ('corn', 21406),
 ('las', 21380),
 ('soft', 21314),
 ('ready', 21241),
 ('set', 21147),
 ('easy', 21036),
 ('local', 21002),
 ('dip', 20979),
 ('plenty', 20855),
 ('side', 20697),
 ('turn', 20484),
 ('remember', 20399),
 ('park', 20382),
 ('entire', 20243),
 ('salt', 20180),
 ('hit', 20129),
 ('onion', 20072),
 ('dress', 20069),
 ('crowd', 19988),
 ('charge', 19714),
 ('fact', 19676),
 ('share', 19675),
 ('ordered', 19547),
 ('literally', 19519),
 ('base', 19502),
 ('hand', 19415),
 ('ago', 19371),
 ('black', 19273),
 ('totally', 19000),
 ('rest', 18831),
 ('honestly', 18810),
 ('crave', 18742),
 ('job', 18672),
 ('simple', 18618),
 ('man', 18557),
 ('option', 18514),
 ('dog', 18399),
 ('juicy', 18343),
 ('min', 18236),
 ('write', 18234),
 ('outstanding', 18123),
 ('early', 17862),
 ('club', 17700),
 ('type', 17637),
 ('belly', 17581),
 ('hash', 17314),
 ('patio', 17289),
 ('life', 17220),
 ('read', 17127),
 ('spice', 17090),
 ('problem', 17020),
 ('entree', 16991),
 ('tomato', 16963),
 ('head', 16826),
 ('counter', 16792),
 ('die', 16709),
 ('loud', 16653),
 ('understand', 16646),
 ('slice', 16560),
 ('greek', 16535),
 ('spend', 16433),
 ('sausage', 16314),
 ('filet', 16280),
 ('piece', 16254),
 ('weekend', 16254),
 ('level', 16236),
 ('completely', 16197),
 ('know', 16195),
 ('include', 16152),
 ('incredible', 16136),
 ('unique', 15989),
 ('stick', 15807),
 ('rare', 15756),
 ('girl', 15696),
 ('pad', 15585),
 ('eaten', 15532),
 ('ethnic', 15455),
 ('duck', 15344),
 ('wrap', 15308),
 ('top', 15258),
 ('lamb', 15250),
 ('avocado', 15124),
 ('speak', 15121),
 ('card', 15113),
 ('pleasant', 15042),
 ('change', 15027),
 ('prepare', 14980),
 ('smoke', 14937),
 ('mediocre', 14926),
 ('floor', 14880),
 ('minute', 14831),
 ('cute', 14807),
 ('indian', 14557),
 ('bomb', 14546),
 ('melt', 14539),
 ('chili', 14511),
 ('low', 14496),
 ('game', 14489),
 ('save', 14443),
 ('crust', 14389),
 ('excite', 14376),
 ('split', 14356),
 ('perfection', 14353),
 ('hope', 14352),
 ('spanish', 14250),
 ('prime', 14213),
 ('delivery', 14201),
 ('buy', 14188),
 ('lettuce', 14125),
 ('dim', 14035),
 ('fried', 13860),
 ('accommodate', 13854),
 ('poor', 13654),
 ('presentation', 13638),
 ('oil', 13621),
 ('tonight', 13606),
 ('generous', 13590),
 ('sour', 13569),
 ('provide', 13566),
 ('creamy', 13523),
 ('meet', 13522),
 ('attention', 13516),
 ('satisfy', 13495),
 ('cup', 13460),
 ('lemon', 13419),
 ('forward', 13369),
 ('cost', 13292),
 ('item', 13285),
 ('bottle', 13273),
 ('pepper', 13242),
 ('establishment', 13242),
 ('cocktail', 13237),
 ('dirty', 13234),
 ('shake', 13192),
 ('bake', 13190),
 ('solid', 13186),
 ('healthy', 13095),
 ('joint', 13085),
 ('even', 13045),
 ('tofu', 13018),
 ('dine', 12985),
 ('phone', 12930),
 ('texture', 12830),
 ('sum', 12769),
 ('chance', 12728),
 ('gem', 12702),
 ('crazy', 12695),
 ('rush', 12640),
 ('waste', 12633),
 ('vegetarian', 12599),
 ('smile', 12492),
 ('pot', 12471),
 ('poke', 12439),
 ('simply', 12435),
 ('smell', 12383),
 ('mention', 12378),
 ('single', 12332),
 ('total', 12323),
 ('son', 12280),
 ('benedict', 12223),
 ('traditional', 12215),
 ('shop', 12210),
 ('attitude', 12202),
 ('mix', 12193),
 ('orange', 12186),
 ('suppose', 12052),
 ('basically', 12044),
 ('bun', 12026),
 ('wall', 12008),
 ('city', 11959),
 ('standard', 11944),
 ('blue', 11929),
 ('truffle', 11906),
 ('space', 11843),
 ('grab', 11818),
 ('management', 11813),
 ('mixed', 11812),
 ('margarita', 11791),
 ('spinach', 11777),
 ('hear', 11698),
 ('typical', 11692),
 ('eye', 11680),
 ('rich', 11644),
 ('pie', 11596),
 ('break', 11591),
 ('fat', 11575),
 ('middle', 11499),
 ('touch', 11462),
 ('notch', 11407),
 ('gravy', 11301),
 ('store', 11301),
 ('prefer', 11277),
 ('greasy', 11275),
 ('idea', 11205),
 ('usual', 11201),
 ('sat', 11161),
 ('tiny', 11130),
 ('mushroom', 11127),
 ('sign', 11103),
 ('sad', 11076),
 ('banana', 11048),
 ('cover', 11037),
 ('note', 11015),
 ('spring', 10990),
 ('homemade', 10989),
 ('issue', 10987),
 ('casual', 10923),
 ('brisket', 10843),
 ('weird', 10821),
 ('easily', 10818),
 ('dark', 10790),
 ('daughter', 10750),
 ('rating', 10747),
 ('barely', 10735),
 ('downtown', 10717),
 ('deep', 10702),
 ('environment', 10694),
 ('treat', 10689),
 ('lose', 10687),
 ('opinion', 10686),
 ('fruit', 10659),
 ('cashier', 10658),
 ('bone', 10647),
 ('surprise', 10600),
 ('face', 10546),
 ('comfortable', 10536),
 ('fancy', 10499),
 ('box', 10441),
 ('rock', 10358),
 ('desert', 10322),
 ('heat', 10308),
 ('host', 10306),
 ('cheesecake', 10301),
 ('sea', 10278),
 ('fairly', 10269),
 ('number', 10219),
 ('young', 10207),
 ('honey', 10207),
 ('turkey', 10187),
 ('longer', 10186),
 ('station', 10113),
 ('exceptional', 10106),
 ('hubby', 10098),
 ('dont', 10082),
 ('figure', 10069),
 ('bean', 10048),
 ('ambience', 10045),
 ('fair', 9971),
 ('crunchy', 9966),
 ('pudding', 9912),
 ('fabulous', 9894),
 ('throw', 9891),
 ('mall', 9883),
 ('book', 9869),
 ('soggy', 9858),
 ('beat', 9774),
 ('brown', 9748),
 ('awful', 9679),
 ('brazilian', 9666),
 ('phenomenal', 9659),
 ('thought', 9647),
 ('multiple', 9611),
 ('milk', 9599),
 ('dining', 9586),
 ('forever', 9567),
 ('york', 9533),
 ('nicely', 9522),
 ('disappointing', 9464),
 ('prepared', 9459),
 ('juice', 9420),
 ('raw', 9411),
 ('strong', 9385),
 ('double', 9376),
 ('affordable', 9373),
 ('lovely', 9315),
 ('strawberry', 9293),
 ('professional', 9172),
 ('case', 9168),
 ('damn', 9167),
 ('plain', 9154),
 ('original', 9144),
 ('polite', 9119),
 ('combination', 9107),
 ('matter', 9091),
 ('disappointment', 9074),
 ('true', 9053),
 ('complimentary', 9030),
 ('ahead', 9022),
 ('classic', 8994),
 ('cuisine', 8985),
 ('complaint', 8907),
 ('mango', 8837),
 ('play', 8808),
 ('crisp', 8795),
 ('hate', 8790),
 ('heavy', 8759),
 ('frozen', 8753),
 ('miss', 8751),
 ('chewy', 8697),
 ('normal', 8690),
 ('month', 8673),
 ('al', 8661),
 ('sticky', 8644),
 ('dance', 8635),
 ('booth', 8631),
 ('chop', 8613),
 ('kick', 8604),
 ('boy', 8591),
 ('lack', 8584),
 ('parking', 8564),
 ('welcome', 8547),
 ('st', 8537),
 ('shot', 8522),
 ('moist', 8502),
 ('center', 8476),
 ('hidden', 8463),
 ('catch', 8433),
 ('popular', 8426),
 ('sugar', 8380),
 ('window', 8346),
 ('chain', 8336),
 ('modern', 8291),
 ('bunch', 8287),
 ('baby', 8281),
 ('knowledgeable', 8271),
 ('incredibly', 8261),
 ('sort', 8259),
 ('refill', 8233),
 ('diner', 8226),
 ('spaghetti', 8211),
 ('happen', 8194),
 ('grand', 8168),
 ('plan', 8138),
 ('tough', 8129),
 ('post', 8128),
 ('omelette', 8121),
 ('afternoon', 8113),
 ('sister', 8104),
 ('soda', 8098),
 ('coconut', 8089),
 ('spectacular', 8088),
 ('yeah', 8080),
 ('avoid', 8077),
 ('personally', 8034),
 ('corner', 8032),
 ('request', 8031),
 ('agree', 8000),
 ('funny', 7978),
 ('round', 7969),
 ('overly', 7950),
 ('boba', 7948),
 ('sport', 7944),
 ('mood', 7933),
 ('mistake', 7840),
 ('continue', 7826),
 ('disgust', 7821),
 ('platter', 7816),
 ('hold', 7815),
 ('picture', 7803),
 ('alcohol', 7802),
 ('limit', 7786),
 ('word', 7755),
 ('refresh', 7720),
 ('complain', 7712),
 ('school', 7710),
 ('interior', 7707),
 ('soy', 7705),
 ('pas', 7691),
 ('basic', 7681),
 ('el', 7659),
 ('explain', 7649),
 ('ridiculous', 7648),
 ('bag', 7646),
 ('surprisingly', 7641),
 ('sake', 7615),
 ('future', 7600),
 ('sell', 7584),
 ('pancake', 7576),
 ('pool', 7565),
 ('ranch', 7561),
 ('par', 7550),
 ('gluten', 7504),
 ('omelet', 7501),
 ('waffle', 7492),
 ('addition', 7456),
 ('conversation', 7436),
 ('personal', 7421),
 ('efficient', 7407),
 ('bathroom', 7397),
 ('balance', 7384),
 ('pineapple', 7382),
 ('ham', 7365),
 ('tortilla', 7357),
 ('step', 7323),
 ('handle', 7315),
 ('didnt', 7283),
 ('lounge', 7265),
 ('state', 7253),
 ('outdoor', 7233),
 ('neighborhood', 7230),
 ('goodness', 7230),
 ('superb', 7195),
 ('heart', 7190),
 ('moment', 7178),
 ('alright', 7177),
 ('clear', 7166),
 ('hair', 7152),
 ('car', 7150),
 ('mignon', 7128),
 ('sick', 7123),
 ('stomach', 7122),
 ('mess', 7102),
 ('savory', 7093),
 ('taiwanese', 7079),
 ('country', 7076),
 ('gross', 7066),
 ('quiet', 7053),
 ('chill', 7035),
 ('island', 7025),
 ('convenient', 7013),
 ('want', 7004),
 ('ton', 6968),
 ('prompt', 6960),
 ('credit', 6955),
 ('eggplant', 6943),
 ('signature', 6923),
 ('british', 6922),
 ('eating', 6922),
 ('buffalo', 6900),
 ('straight', 6864),
 ('basil', 6864),
 ('famous', 6847),
 ('run', 6842),
 ('burnt', 6818),
 ('general', 6809),
 ('lucky', 6805),
 ('skip', 6794),
 ('drop', 6779),
 ('bloody', 6774),
 ('olive', 6760),
 ('correct', 6757),
 ('consistent', 6754),
 ('cucumber', 6738),
 ('employee', 6711),
 ('separate', 6696),
 ('octopus', 6695),
 ('syrup', 6676),
 ('enjoyable', 6670),
 ('negative', 6648),
 ('team', 6629),
 ('guest', 6599),
 ('interest', 6565),
 ('memorable', 6564),
 ('absolute', 6559),
 ('actual', 6556),
 ('broccoli', 6556),
 ('heaven', 6553),
 ('wellington', 6511),
 ('asparagus', 6505),
 ('flat', 6477),
 ('anniversary', 6474),
 ('cash', 6465),
 ('version', 6457),
 ('gyro', 6447),
 ('pita', 6414),
 ('bay', 6385),
 ('recommendation', 6383),
 ('bass', 6378),
 ('lime', 6341),
 ('honest', 6330),
 ('chow', 6328),
 ('celebrate', 6322),
 ('african', 6319),
 ('apple', 6317),
 ('picky', 6290),
 ('paper', 6283),
 ('batter', 6275),
 ('pleasantly', 6274),
 ('king', 6237),
 ('difficult', 6230),
 ('additional', 6207),
 ('pastor', 6198),
 ('complete', 6174),
 ('pair', 6169),
 ('hole', 6152),
 ('patty', 6136),
 ('vegetable', 6122),
 ('fall', 6109),
 ('sense', 6100),
 ('impressive', 6095),
 ('vanilla', 6088),
 ('rio', 6088),
 ('san', 6068),
 ('awhile', 6054),
 ('skin', 6039),
 ('load', 6037),
 ('positive', 6036),
 ('ravioli', 6033),
 ('bother', 6014),
 ('coupon', 6007),
 ('stuffed', 5999),
 ('freshly', 5996),
 ('cozy', 5995),
 ('notice', 5994),
 ('bell', 5990),
 ('mary', 5986),
 ('yesterday', 5979),
 ('arrive', 5974),
 ('fluffy', 5957),
 ('woman', 5949),
 ('company', 5929),
 ('color', 5929),
 ('send', 5909),
 ('brought', 5902),
 ('mother', 5900),
 ('comfort', 5890),
 ('regret', 5877),
 ('assume', 5870),
 ('crunch', 5849),
 ('event', 5836),
 ('pop', 5799),
 ('update', 5787),
 ('concept', 5769),
 ('pan', 5766),
 ('imagine', 5763),
 ('roast', 5761),
 ('difference', 5757),
 ('build', 5755),
 ('range', 5738),
 ('sadly', 5728),
 ('discount', 5721),
 ('tapa', 5719),
 ('odd', 5684),
 ('shack', 5682),
 ('blow', 5679),
 ('dollar', 5670),
 ('yellowtail', 5662),
 ('hey', 5651),
 ('specialty', 5647),
 ('fell', 5647),
 ('upset', 5632),
 ('wear', 5628),
 ('enter', 5627),
 ('travel', 5626),
 ('ginger', 5612),
 ('doubt', 5580),
 ('register', 5579),
 ('yellow', 5573),
 ('romantic', 5555),
 ('realize', 5520),
 ('tap', 5515),
 ('pub', 5512),
 ('golden', 5501),
 ('sear', 5501),
 ('wide', 5488),
 ('entrance', 5482),
 ('learn', 5467),
 ('told', 5465),
 ('inform', 5459),
 ('fountain', 5456),
 ('previous', 5443),
 ('consistently', 5403),
 ('marinate', 5401),
 ('select', 5391),
 ('worst', 5377),
 ('greet', 5361),
 ('rave', 5358),
 ('luckily', 5353),
 ('sample', 5353),
 ('rate', 5337),
 ('impeccable', 5335),
 ('machine', 5328),
 ('nearby', 5319),
 ('upscale', 5318),
 ('god', 5318),
 ('veal', 5306),
 ('spent', 5296),
 ('mild', 5285),
 ('burn', 5239),
 ('prior', 5238),
 ('photo', 5238),
 ('valley', 5233),
 ('fatty', 5232),
 ('express', 5190),
 ('daily', 5167),
 ('tasteless', 5157),
 ('situation', 5152),
 ('wynn', 5128),
 ('east', 5112),
 ('caribbean', 5109),
 ('brother', 5108),
 ('bed', 5097),
 ('courteous', 5096),
 ('basket', 5078),
 ('fusion', 5071),
 ('muffin', 5065),
 ('drunk', 5064),
 ('ruin', 5058),
 ('garden', 5056),
 ('trust', 5056),
 ('crap', 5046),
 ('air', 5038),
 ('oxtail', 5019),
 ('giant', 5019),
 ('whip', 4993),
 ('chose', 4990),
 ('cheesy', 4986),
 ('circus', 4985),
 ('cinnamon', 4977),
 ('till', 4972),
 ('constantly', 4964),
 ('china', 4961),
 ('crepe', 4958),
 ('joke', 4957),
 ('buttery', 4926),
 ('pakistani', 4916),
 ('bright', 4916),
 ('lemonade', 4916),
 ('typically', 4902),
 ('compare', 4897),
 ('fee', 4888),
 ('receipt', 4879),
 ('feeling', 4873),
 ('win', 4858),
 ('patient', 4847),
 ('raise', 4845),
 ('private', 4840),
 ('seating', 4838),
 ('extensive', 4838),
 ('grow', 4832),
 ('protein', 4831),
 ('as', 4831),
 ('deliver', 4822),
 ('personable', 4821),
 ('goat', 4810),
 ('eats', 4794),
 ('age', 4789),
 ('design', 4783),
 ('interesting', 4779),
 ('snack', 4777),
 ('frequent', 4769),
 ('southern', 4765),
 ('chile', 4765),
 ('vacation', 4761),
 ('west', 4753),
 ('story', 4746),
 ('peanut', 4739),
 ('delightful', 4738),
 ('nasty', 4737),
 ('satisfied', 4727),
 ('gamble', 4726),
 ('irish', 4703),
 ('feed', 4696),
 ('favor', 4691),
 ('venue', 4664),
 ('ground', 4653),
 ('admit', 4653),
 ('jam', 4645),
 ('cart', 4643),
 ('fit', 4635),
 ('evening', 4609),
 ('desk', 4608),
 ('shame', 4597),
 ('south', 4595),
 ('mon', 4576),
 ('plastic', 4563),
 ('entertainment', 4559),
 ('search', 4554),
 ('sound', 4552),
 ('fault', 4547),
 ('answer', 4540),
 ('deserve', 4537),
 ('class', 4533),
 ('airport', 4526),
 ('oily', 4525),
 ('remove', 4520),
 ('acknowledge', 4517),
 ('left', 4503),
 ('pepperoni', 4501),
 ('dad', 4499),
 ('velvet', 4494),
 ('sesame', 4488),
 ('decorate', 4476),
 ('tower', 4473),
 ('appetite', 4465),
 ('pastry', 4464),
 ('vodka', 4460),
 ('creative', 4450),
 ('wise', 4448),
 ('move', 4437),
 ('watermelon', 4429),
 ('toffee', 4404),
 ('blend', 4387),
 ('closer', 4344),
 ('find', 4338),
 ('mein', 4337),
 ('beautifully', 4334),
 ('annoy', 4326),
 ('relax', 4324),
 ('question', 4313),
 ('mustard', 4306),
 ('consistency', 4298),
 ('pink', 4297),
 ('bottomless', 4296),
 ('tail', 4294),
 ('fare', 4290),
 ('highlight', 4288),
 ...]
In [13]:
# Make a wordcloud from the occurence of every word in the dataset
# Convert reviews to a dictionary of words with values and items the occurence of every word
word_could_dict = Counter(" ".join(Nevada['input_text']).split(" "))
word_could_dict

# Generate wordcloud
wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2', collocations=False).generate_from_frequencies(word_could_dict)

# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 

plt.show()
In [14]:
# Save image
#wordcloud.to_file("wordcloud.png")
In [15]:
Nevada['month'] = Nevada['date'].dt.month
labels = 'January' , 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'
plot_months = Nevada.groupby([Nevada['month']], sort=True).count().eval('review_id')
In [16]:
sns.set(style="whitegrid")
# Create matplotlib figure
fig = plt.figure(figsize=(19,10),facecolor='#F0F0F0') 
# Create matplotlib axes
ax = fig.add_subplot(111)
ax.grid(color='grey', linestyle='-', linewidth=0.6, alpha=1.0)

# Create a sample dataframe with an text index
plotdata = pd.DataFrame(
    {"Reviews":np.stack(plot_months).astype(None) }, 
    index=['January' , 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'])
# Plot a bar chart

x1 = [0,1,2,3,4,5,6,7,8,9,10,11]
months=['January' , 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plotdata.plot(kind="line",color='#F2583E', ax=ax, fontsize=14)
plt.title("Reviews per Month", fontsize=16)
plt.xlabel("Months", fontsize=16)
plt.ylabel("Number of Reviews", fontsize=16)
ax.set_xticks(x1)
ax.set_xticklabels(months, minor=False, rotation=45, fontsize=16, horizontalalignment="center")
plt.grid(True)
#plt.savefig("month_plot2.png")
plt.show()
Exploration of Ratings
In [17]:
#Get the distribution of the ratings
x=Nevada['review_stars'].value_counts()

x=x.sort_index()

# Create matplotlib figure
fig = plt.figure(figsize=(18,10),facecolor='#F0F0F0') 
# Create matplotlib axes
ax = fig.add_subplot(111) 

colors = ( "#747E80", "#D5E1DD", "#F7F3E8", "#77BED2", "#F2583E") 
#plot
#plt.figure(figsize=(8,4))
ax.bar(x.index, x.values, alpha=1, color=colors)
plt.title("Star Rating Distribution", fontsize=16)
plt.ylabel('# of Reviews', fontsize=16)
plt.xlabel('Star Ratings ', fontsize=16)

#adding the text labels
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=15)
    
plt.grid(False)    
plt.xticks(fontsize=18)
#plt.savefig("rating_barplot.png")
plt.show()
In [18]:
labels = 'Star 1' , 'Star 2', 'Star 3', 'Star 4', 'Star 5'
sizes = [116354,80257,112584,225805,478794]
explode = (0.1, 0.1, 0.1, 0.1, 0.1)

# Create matplotlib figure
fig = plt.figure(figsize=(18,10),facecolor='#F0F0F0') 
# Create matplotlib axes
ax = fig.add_subplot(111) 

# Creating color parameters 
colors = ( "#747E80", "#D5E1DD", "#F7F3E8", "#77BED2", "#F2583E") 

ax.pie(sizes,
       explode = explode,
       labels = labels,
       autopct = '%1.1f%%',
       shadow = False,
       startangle = 100,
       colors = colors)

ax.axis ('equal')
# The title of the plot
ax.set_title("Percentage Distribution of Review Stars", fontsize=15, pad=20)
#plt.savefig("stars_pie.png")
plt.show()